April 24th 2018

Story of making childhood dreams come true

  • What is a choropleth?

  • Where to get (legally) regions shapefiles?

  • Drawing maps with ggplot in R

Goal

Goal

  • We want to color of each region to reflect value of some variable (e.g. unemployment, gdp)
  • We need a definition of each region…
  • … and a way of joining regions with variables

What is a map and why having flat earth would make our lifes easier

  • map is a mathemtical image of a space that projects objects and geografical phenomena in scale and rectangular projection
  • set of cartographical signs with a purpose to show geographical objects and phenomena
  • reflects reality in a specificely generalized way – could be treated as a model for a reality

Good old Mercator

Gall–Peters projection

Robinson projection

Technical note

require(rgdal) #function readOGR
require(rgeos) #function gSimplify and spTransform
library(ggplot2) #function fortify
EU_NUTS <- readOGR("data/ref-nuts-2013-03m/", layer = "NUTS_RG_03M_2013_3035",
    encoding = "utf8")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/piotr/szychta_w_danych/prezentacja_brug_18_04_24/data/ref-nuts-2013-03m", layer: "NUTS_RG_03M_2013_3035"
## with 1951 features
## It has 5 fields
#' we need to save the id, since we loose it when transforming into data.frame
EU_NUTS_codes <- data.frame(id = as.character(0:1950), 
    nuts_id = EU_NUTS@data$NUTS_ID, 
    stringsAsFactors = FALSE)

#' by simplifying we reduce file size and drawing time
EU_NUTS_simplified <- gSimplify(EU_NUTS, tol = 1000, topologyPreserve = TRUE)
#' changin to mercator (convienent to include eg. localization from google maps)
EU_NUTS_fortified <- spTransform(EU_NUTS_simplified, CRS("+proj=longlat +datum=WGS84"))
EU_NUTS_fortified <- fortify(EU_NUTS_fortified)

#' computing centroids, useful for making labels
centroids_countries = data.frame(gCentroid(EU_NUTS, byid = TRUE))
centroids_countries$id <- rownames((centroids_countries))

glimpse(EU_NUTS_fortified)
## Observations: 349,730
## Variables: 7
## $ long  <dbl> 22.99716, 23.05680, 23.26778, 23.43325, 23.51521, 23.581...
## $ lat   <dbl> 43.80763, 43.79581, 43.84681, 43.85057, 43.83107, 43.797...
## $ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ hole  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
## $ piece <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ id    <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "...
## $ group <fct> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0...

Let’s get some data from eurostat

library(eurostat)
gdp_data <- get_eurostat('tec00001', time_format = "num", stringsAsFactors = F) 
## Table tec00001 cached at /var/folders/bb/v3bv876j4nj7sk3ycjyqvhqm0000gn/T//RtmpzSrc9H/eurostat/tec00001_num_code_FF.rds
glimpse(gdp_data)
## Observations: 1,371
## Variables: 5
## $ na_item <chr> "B1GQ", "B1GQ", "B1GQ", "B1GQ", "B1GQ", "B1GQ", "B1GQ"...
## $ unit    <chr> "CP_EUR_HAB", "CP_EUR_HAB", "CP_EUR_HAB", "CP_EUR_HAB"...
## $ geo     <chr> "AL", "AT", "BE", "BG", "CH", "CY", "CZ", "DE", "DK", ...
## $ time    <dbl> 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, ...
## $ values  <dbl> 2400, 32400, 31000, 3500, 45600, 21700, 12100, 29500, ...

gdp_data <- gdp_data %>%
  filter(unit == 'CP_EUR_HAB', nchar(as.character(geo)) == 2) %>%
  mutate(unit = factor(unit))

gdp_data_labels <- label_eurostat(gdp_data, fix_duplicated = T)
glimpse(gdp_data_labels)
## Observations: 418
## Variables: 5
## $ na_item <chr> "Gross domestic product at market prices", "Gross dome...
## $ unit    <fct> Current prices, euro per capita, Current prices, euro ...
## $ geo     <chr> "Albania", "Austria", "Belgium", "Bulgaria", "Switzerl...
## $ time    <dbl> 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, ...
## $ values  <dbl> 2400, 32400, 31000, 3500, 45600, 21700, 12100, 29500, ...

plot_data <- EU_NUTS_fortified %>%
    inner_join(EU_NUTS_codes, 
                         by = c("id")) %>%
    inner_join(gdp_data, 
                         by = c('nuts_id'='geo'))
glimpse(plot_data)
## Observations: 558,275
## Variables: 12
## $ long    <dbl> 22.99716, 22.99716, 22.99716, 22.99716, 22.99716, 22.9...
## $ lat     <dbl> 43.80763, 43.80763, 43.80763, 43.80763, 43.80763, 43.8...
## $ order   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, ...
## $ hole    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE...
## $ piece   <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ id      <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",...
## $ group   <fct> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,...
## $ nuts_id <chr> "BG", "BG", "BG", "BG", "BG", "BG", "BG", "BG", "BG", ...
## $ na_item <chr> "B1GQ", "B1GQ", "B1GQ", "B1GQ", "B1GQ", "B1GQ", "B1GQ"...
## $ unit    <fct> CP_EUR_HAB, CP_EUR_HAB, CP_EUR_HAB, CP_EUR_HAB, CP_EUR...
## $ time    <dbl> 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, ...
## $ values  <dbl> 3500, 4200, 4900, 4900, 5100, 5600, 5700, 5800, 5900, ...

First attempt

plot_data %>%
    filter(time == 2016) %>%
    ggplot(mapping = aes(x = long, y = lat)) +
  geom_polygon(mapping = aes(group = group, fill = values))

Making a map

plot_data %>%
    filter(time == 2016) %>%
    ggplot(mapping = aes(x = long, y = lat)) +
    scale_y_continuous(limits = c(33, 72)) +
    scale_x_continuous(limits = c(-25, 32)) +
        #drawing ploygons
  geom_polygon(mapping = aes(group = group, fill = values)) +
    #set coord system
    coord_map() +
        #set colors
  scale_fill_distiller("GDP", palette = "YlGn", 
                                breaks = pretty_breaks(n = 6), 
                                trans = "reverse") + 
  guides(fill = guide_legend(reverse = TRUE)) + 
    labs(title = "GDP per capita", 
             caption = "Based on eurostat data") + 
  theme_map(base_size = 18)

breaks_gdp = c(0, 10000, 20000, 30000, 40000,50000, 60000, 150000)
labels_gdp = c('less than 10K', '10-20K', '20-30K', '30-40K', '40-50K', '50-60K', '>60K')
plot_data %>%
    mutate(values = cut(values, breaks_gdp, labels = labels_gdp)) %>%
        filter(time %in% c(2006, 2016)) %>%
    ggplot(mapping = aes(x = long, y = lat)) +
    scale_y_continuous(limits = c(33, 72)) +
    scale_x_continuous(limits = c(-25, 32)) +
  geom_polygon(mapping = aes(group = group, fill = values)) + #draw polygons
    coord_map() +  #set coord system
    scale_fill_brewer("GDP", palette = "RdYlGn") +
    facet_wrap(~time, nrow = 1, ncol = 2) +
  guides(fill = guide_legend(reverse = TRUE)) + 
    labs(title = "GDP per capita", 
             caption = "Based on eurostat data") + 
  theme_map(base_size = 30, base_family = 'Helvetica Neue Light') +
  theme(title = element_text(hjust = 0.5),
            legend.position="right",
        legend.key.height = unit(1, "cm"),
        legend.key.width = unit(0.7, "cm"))

Cartograms

library(Rcartogram)
library(getcartr)
gdp_2016 <- filter(gdp_data, time == 2016)
EU_NUTS_countries <- EU_NUTS[EU_NUTS$LEVL_CODE == 0 & EU_NUTS$NUTS_ID %in% gdp_2016$geo,]
EU_NUTS_countries@data <- EU_NUTS_countries@data %>%
  inner_join(gdp_2016, by = c('NUTS_ID'='geo'))
## Warning: Column `NUTS_ID`/`geo` joining factor and character vector,
## coercing into character vector
EU_NUTS_cartogram <- quick.carto(spdf = EU_NUTS_countries, v = EU_NUTS_countries@data$values)

plot_data2 <- inner_join(fortify(EU_NUTS_cartogram), EU_NUTS_codes, 
                                             by = c("id")) %>%
    inner_join(gdp_2016, by = c('nuts_id'='geo'))

Tips and tricks

  • (relatively) new package sf - no need for playing with codes, tidyverse support
  • Finding the right projection proj4string(OBJECT_FROM_SHP)
  • Placing legend ‘inside plot’ theme(legened.position = c(0.1,0.1))
  • Adding labels to polygons geom_label